package com.zftlive.spider.sample;

import java.util.LinkedList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.zftlive.spider.tool.ToolDateTime;
import com.zftlive.spider.tool.ToolFile;

/**
 * Small Jsoup sample that downloads a page, dumps the parsed HTML to a file and
 * prints the elements matching a given CSS class name.
 */
public class JsoupSample {

  /**
   * CSS class name used to locate the pagination bar on a list page.
   * (The original source also kept "page-main" as a commented-out alternative.)
   */
  public final static String CSS_PAGENATION_BAR = "pagination";

  /**
   * CSS class name used to locate the article body on a detail page.
   */
  public final static String CSS_DETAIL_CONTENT = "a-content-wrap";

  /**
   * Network connection timeout in milliseconds (10 s).
   * Not currently applied to the connections opened by this class.
   */
  public final static int TIME_OUT = 10 * 1000;

  /**
   * Directory prefix prepended to every dump file name. Empty until
   * {@link #main(String[])} sets it to the working directory.
   */
  private static String filePathRoot = "";

  /**
   * Entry point: stores dump files under the current working directory and
   * scans the sample list page for its pagination bar.
   *
   * @param args ignored
   */
  public static void main(String[] args) {
    // Directory the HTML dumps are written to.
    filePathRoot = System.getProperty("user.dir") + "/";
    gainDetailPageURL("http://bbs.byr.cn/#!board/JobInfo");
    // To try the detail extractor instead, call extractDetailContent(...) with an article URL.
  }

  /**
   * Loads a list page, dumps it to a {@code List-<timestamp>.html} file and prints
   * the HTML of every element carrying the {@link #CSS_PAGENATION_BAR} class,
   * together with the HTML of each of its direct children.
   *
   * <p>Failures are reported on {@code System.err} and never propagated.
   *
   * @param strListPageURL address of the list page to load
   * @return the collected detail page URLs; this implementation does not extract
   *         any URLs yet, so the list is always empty (never {@code null})
   */
  public static List<String> gainDetailPageURL(String strListPageURL) {
    List<String> mDetailPageURLs = new LinkedList<String>();
    try {
      Document html = fetchAndDump(strListPageURL, "List-");

      // Look for the pagination bar(s).
      Elements elements = html.body().getElementsByClass(CSS_PAGENATION_BAR);
      if (elements.isEmpty()) {
        System.err.println("-->未找到分页栏样式["+CSS_PAGENATION_BAR+"]");
        return mDetailPageURLs;
      }

      for (Element element : elements) {
        for (Element child : element.children()) {
          // Print the child itself; the original printed the parent here by mistake.
          System.out.println("-->孩子节点html：\n"+child.html()+"\n");
        }
        System.out.println("-->分页栏html：\n"+element.html()+"\n");
      }
      System.out.println("-->查找结束...");
    } catch (Exception e) {
      // Sample code boundary: report the failure and fall through to the empty result.
      e.printStackTrace();
      System.err.println("-->抓取分页栏数据发生异常，原因："+e.getMessage());
    }
    return mDetailPageURLs;
  }

  /**
   * Loads a detail page, dumps it to a {@code Detail-<timestamp>.html} file and
   * returns the inner HTML of the first element carrying the
   * {@link #CSS_DETAIL_CONTENT} class.
   *
   * <p>Failures are reported on {@code System.err} and never propagated.
   *
   * @param detailURL address of the detail page to load
   * @return the inner HTML of the first matching element, or an empty string when
   *         no element matches or the page could not be loaded
   */
  public static String extractDetailContent(String detailURL) {
    String strContentText = "";
    try {
      Document html = fetchAndDump(detailURL, "Detail-");

      // Look for the article body.
      Elements elements = html.getElementsByClass(CSS_DETAIL_CONTENT);
      if (elements.isEmpty()) {
        System.err.println("-->未找到详情内容样式["+CSS_DETAIL_CONTENT+"]");
        return "";
      }
      strContentText = elements.get(0).html();
      System.out.println("-->详情内容html：\n"+strContentText+"\n");
      System.out.println("-->查找结束...");
    } catch (Exception e) {
      // Sample code boundary: report the failure and return whatever was gathered.
      e.printStackTrace();
      System.err.println("-->抓取数据发生异常，原因："+e.getMessage());
    }
    return strContentText;
  }

  /**
   * Downloads and parses the page at {@code url}, then writes the parsed HTML to
   * {@code <filePathRoot><filePrefix><yyyyMMddHHmmss>.html}.
   *
   * @param url        address of the page to load
   * @param filePrefix file name prefix placed before the timestamp
   * @return the parsed document
   * @throws Exception if the download or the file write fails; declared broadly
   *                   because the checked exceptions of {@code ToolFile.write}
   *                   are not visible from this class
   */
  private static Document fetchAndDump(String url, String filePrefix) throws Exception {
    Document html = Jsoup.connect(url).get();
    String dumpPath = filePathRoot + filePrefix
        + ToolDateTime.gainCurrentDate("yyyyMMddHHmmss") + ".html";
    // Encode explicitly so the dump does not depend on the platform default charset.
    ToolFile.write(dumpPath, html.html().getBytes(java.nio.charset.StandardCharsets.UTF_8));
    return html;
  }
}
